{
clear_page(pte);
__make_page_readonly(pte);
+ queue_pte_pin(__pa(pte));
}
return pte;
static __inline__ void pte_free_slow(pte_t *pte) /* unpin, make writable, then free the page at pte */
{
+ queue_pte_unpin(__pa(pte)); /* queue an unpin request for the page's __pa() address */
__make_page_writable(pte); /* presumably reverses an earlier __make_page_readonly() on this page -- confirm */
free_page((unsigned long)pte);
}
#undef queue_invlpg
#undef queue_pgd_pin
#undef queue_pgd_unpin
+#undef queue_pte_pin
+#undef queue_pte_unpin
#undef queue_set_ldt
#endif
spin_lock_irqsave(&update_lock, flags);
update_queue[idx].ptr = phys_to_machine(ptr);
update_queue[idx].ptr |= MMU_EXTENDED_COMMAND;
- update_queue[idx].val = MMUEXT_PIN_TABLE;
+ update_queue[idx].val = MMUEXT_PIN_L2_TABLE;
increment_index();
spin_unlock_irqrestore(&update_lock, flags);
}
spin_unlock_irqrestore(&update_lock, flags);
}
+void queue_pte_pin(unsigned long ptr) /* add an MMUEXT_PIN_L1_TABLE request for ptr to update_queue */
+{
+ unsigned long flags;
+ spin_lock_irqsave(&update_lock, flags); /* hold update_lock while the queue slot is filled in */
+ update_queue[idx].ptr = phys_to_machine(ptr); /* ptr is passed through phys_to_machine() before queuing */
+ update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; /* mark the entry as an extended command */
+ update_queue[idx].val = MMUEXT_PIN_L1_TABLE; /* extended sub-command: pin as an L1 table */
+ increment_index(); /* defined outside this hunk; presumably advances idx -- confirm */
+ spin_unlock_irqrestore(&update_lock, flags);
+}
+
+void queue_pte_unpin(unsigned long ptr) /* add an MMUEXT_UNPIN_TABLE request for ptr to update_queue */
+{
+ unsigned long flags;
+ spin_lock_irqsave(&update_lock, flags); /* hold update_lock while the queue slot is filled in */
+ update_queue[idx].ptr = phys_to_machine(ptr); /* ptr is passed through phys_to_machine() before queuing */
+ update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; /* mark the entry as an extended command */
+ update_queue[idx].val = MMUEXT_UNPIN_TABLE; /* extended sub-command: unpin (no per-level variant is used here) */
+ increment_index(); /* defined outside this hunk; presumably advances idx -- confirm */
+ spin_unlock_irqrestore(&update_lock, flags);
+}
+
void queue_set_ldt(unsigned long ptr, unsigned long len)
{
unsigned long flags;
spin_lock_irqsave(&update_lock, flags);
update_queue[idx].ptr = phys_to_machine(ptr);
update_queue[idx].ptr |= MMU_EXTENDED_COMMAND;
- update_queue[idx].val = MMUEXT_PIN_TABLE;
+ update_queue[idx].val = MMUEXT_PIN_L2_TABLE;
increment_index_and_flush();
spin_unlock_irqrestore(&update_lock, flags);
}
spin_unlock_irqrestore(&update_lock, flags);
}
+void xen_pte_pin(unsigned long ptr) /* like queue_pte_pin(), but ends with increment_index_and_flush() */
+{
+ unsigned long flags;
+ spin_lock_irqsave(&update_lock, flags); /* hold update_lock while the queue slot is filled in */
+ update_queue[idx].ptr = phys_to_machine(ptr); /* ptr is passed through phys_to_machine() before queuing */
+ update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; /* mark the entry as an extended command */
+ update_queue[idx].val = MMUEXT_PIN_L1_TABLE; /* extended sub-command: pin as an L1 table */
+ increment_index_and_flush(); /* defined outside this hunk; by its name it also flushes the queue -- confirm */
+ spin_unlock_irqrestore(&update_lock, flags);
+}
+
+void xen_pte_unpin(unsigned long ptr) /* like queue_pte_unpin(), but ends with increment_index_and_flush() */
+{
+ unsigned long flags;
+ spin_lock_irqsave(&update_lock, flags); /* hold update_lock while the queue slot is filled in */
+ update_queue[idx].ptr = phys_to_machine(ptr); /* ptr is passed through phys_to_machine() before queuing */
+ update_queue[idx].ptr |= MMU_EXTENDED_COMMAND; /* mark the entry as an extended command */
+ update_queue[idx].val = MMUEXT_UNPIN_TABLE; /* extended sub-command: unpin */
+ increment_index_and_flush(); /* defined outside this hunk; by its name it also flushes the queue -- confirm */
+ spin_unlock_irqrestore(&update_lock, flags);
+}
+
void xen_set_ldt(unsigned long ptr, unsigned long len)
{
unsigned long flags;
void queue_invlpg(unsigned long ptr);
void queue_pgd_pin(unsigned long ptr);
void queue_pgd_unpin(unsigned long ptr);
+void queue_pte_pin(unsigned long ptr);
+void queue_pte_unpin(unsigned long ptr);
void queue_set_ldt(unsigned long ptr, unsigned long bytes);
void queue_machphys_update(unsigned long mfn, unsigned long pfn);
void xen_l1_entry_update(pte_t *ptr, unsigned long val);
void xen_invlpg(unsigned long ptr);
void xen_pgd_pin(unsigned long ptr);
void xen_pgd_unpin(unsigned long ptr);
+void xen_pte_pin(unsigned long ptr);
+void xen_pte_unpin(unsigned long ptr);
void xen_set_ldt(unsigned long ptr, unsigned long bytes);
void xen_machphys_update(unsigned long mfn, unsigned long pfn);
#define MMU_UPDATE_DEBUG 0
printk("PGD UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \
queue_pgd_unpin(_p); \
})
+#define queue_pte_pin(_p) ({ /* tracing wrapper: print the call site and argument, then call queue_pte_pin() */ \
+ printk("PTE PIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \
+ queue_pte_pin(_p); /* a macro's own name is not re-expanded, so this reaches the function */ \
+})
+#define queue_pte_unpin(_p) ({ /* tracing wrapper: print the call site and argument, then call queue_pte_unpin() */ \
+ printk("PTE UNPIN %s %d: %08lx\n", __FILE__, __LINE__, (_p)); \
+ queue_pte_unpin(_p); /* a macro's own name is not re-expanded, so this reaches the function */ \
+})
#define queue_set_ldt(_p,_l) ({ \
printk("SETL LDT %s %d: %08lx %d\n", __FILE__, __LINE__, (_p), (_l)); \
queue_set_ldt((_p), (_l)); \
* correct protection for the page
*/
if ( add_mmu_update(xc_handle, mmu,
- l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_TABLE) )
+ l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_L2_TABLE) )
goto error_out;
start_info = map_pfn_writeable(
*/
for ( i = 0; i < nr_pfns; i++ )
{
- if ( pfn_type[i] != (L2TAB|LPINTAB) )
- continue;
- if ( add_mmu_update(xc_handle, mmu,
- (pfn_to_mfn_table[i]<<PAGE_SHIFT) |
- MMU_EXTENDED_COMMAND,
- MMUEXT_PIN_TABLE) )
+ if ( pfn_type[i] == (L1TAB|LPINTAB) )
{
- printf("ERR pin L2 pfn=%lx mfn=%lx\n",
- (unsigned long)i, pfn_to_mfn_table[i]);
- goto out;
+ if ( add_mmu_update(xc_handle, mmu,
+ (pfn_to_mfn_table[i]<<PAGE_SHIFT) |
+ MMU_EXTENDED_COMMAND,
+ MMUEXT_PIN_L1_TABLE) ) {
+ printf("ERR pin L1 pfn=%lx mfn=%lx\n",
+ (unsigned long)i, pfn_to_mfn_table[i]);
+ goto out;
+ }
+ }
+ else if ( pfn_type[i] == (L2TAB|LPINTAB) )
+ {
+ if ( add_mmu_update(xc_handle, mmu,
+ (pfn_to_mfn_table[i]<<PAGE_SHIFT) |
+ MMU_EXTENDED_COMMAND,
+ MMUEXT_PIN_L2_TABLE) )
+ {
+ printf("ERR pin L2 pfn=%lx mfn=%lx\n",
+ (unsigned long)i, pfn_to_mfn_table[i]);
+ goto out;
+ }
}
}
* correct protection for the page
*/
if ( add_mmu_update(xc_handle, mmu,
- l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_TABLE) )
+ l2tab | MMU_EXTENDED_COMMAND, MMUEXT_PIN_L2_TABLE) )
goto error_out;
*virt_startinfo_addr =
/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
static int
get_page_from_l2e(
- l2_pgentry_t l2e, unsigned long pfn,
- struct domain *d, unsigned long va_idx)
+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned long va_idx)
{
int rc;
rc = get_page_and_type_from_pagenr(
l2_pgentry_to_pagenr(l2e),
- PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
+ PGT_l1_page_table | (va_idx<<PGT_va_shift), d);
if ( unlikely(!rc) )
return get_linear_pagetable(l2e, pfn, d);
return update_l2e(pl2e, ol2e, nl2e);
if ( unlikely(!get_page_from_l2e(nl2e, pfn, current,
- ((unsigned long)pl2e &
- ~PAGE_MASK) >> 2)) )
+ ((unsigned long)
+ pl2e & ~PAGE_MASK) >> 2 )) )
return 0;
if ( unlikely(!update_l2e(pl2e, ol2e, nl2e)) )
switch ( cmd )
{
- case MMUEXT_PIN_TABLE:
+ case MMUEXT_PIN_L1_TABLE:
+ case MMUEXT_PIN_L2_TABLE:
+
+ /* When we pin an L1 page we now insist that the va
+ backpointer (used for writable page tables) must still be
+ mutable. This is an additional restriction even for guests
+ that don't use writable page tables, but I don't think it
+ will break anything as guests typically pin pages before
+ they are used, hence they'll still be mutable. */
+
okay = get_page_and_type_from_pagenr(
- pfn, PGT_l2_page_table, FOREIGNDOM);
+ pfn,
+ ((cmd==MMUEXT_PIN_L2_TABLE) ?
+ PGT_l2_page_table : (PGT_l1_page_table | PGT_va_mutable) ) ,
+ FOREIGNDOM);
if ( unlikely(!okay) )
{
unsigned long prev_spfn = 0;
l1_pgentry_t *prev_spl1e = 0;
struct domain *d = current;
- u32 type_info;
perfc_incrc(calls_to_mmu_update);
perfc_addc(num_page_updates, count);
}
page = &frame_table[pfn];
- switch ( (type_info = page->u.inuse.type_info) & PGT_type_mask )
+ switch ( (page->u.inuse.type_info & PGT_type_mask) )
{
case PGT_l1_page_table:
- if ( likely(get_page_type(
- page, type_info & (PGT_type_mask|PGT_va_mask))) )
+ if ( likely(passive_get_page_type(page, PGT_l1_page_table)) )
{
okay = mod_l1_entry((l1_pgentry_t *)va,
mk_l1_pgentry(req.val));
[ptwr_info[cpu].writable_l1>>PAGE_SHIFT];
#ifdef PTWR_TRACK_DOMAIN
- if (ptwr_domain[cpu] != current->domain)
+ if (ptwr_domain[cpu] != get_current()->domain)
printk("ptwr_reconnect_disconnected domain mismatch %d != %d\n",
- ptwr_domain[cpu], current->domain);
+ ptwr_domain[cpu], get_current()->domain);
#endif
- PTWR_PRINTK(("[A] page fault in disconn space: addr %08lx space %08lx\n",
+ PTWR_PRINTK(("[A] page fault in disconnected space: addr %08lx space %08lx\n",
addr, ptwr_info[cpu].disconnected << L2_PAGETABLE_SHIFT));
pl2e = &linear_l2_table[ptwr_info[cpu].disconnected];
int i, idx;
#ifdef PTWR_TRACK_DOMAIN
- if (ptwr_info[cpu].domain != current->domain)
+ if (ptwr_info[cpu].domain != get_current()->domain)
printk("ptwr_flush_inactive domain mismatch %d != %d\n",
- ptwr_info[cpu].domain, current->domain);
+ ptwr_info[cpu].domain, get_current()->domain);
#endif
#if 0
{
if ( (page->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
{
#ifdef PTWR_TRACK_DOMAIN
- if ( ptwr_info[cpu].domain != current->domain )
+ if ( ptwr_info[cpu].domain != get_current()->domain )
printk("ptwr_do_page_fault domain mismatch %d != %d\n",
- ptwr_info[cpu].domain, current->domain);
+ ptwr_info[cpu].domain, get_current()->domain);
#endif
pl2e = &linear_l2_table[(page->u.inuse.type_info &
PGT_va_mask) >> PGT_va_shift];
cleanup_writable_pagetable(
prev, PTWR_CLEANUP_ACTIVE | PTWR_CLEANUP_INACTIVE);
+#ifdef PTWR_TRACK_DOMAIN
+ {
+ extern domid_t ptwr_domain[];
+ int cpu = smp_processor_id();
+ if (ptwr_domain[cpu] != prev->domain)
+ printk("switch_to domain mismatch %d != %d\n",
+ ptwr_domain[cpu], prev->domain);
+ ptwr_domain[cpu] = next->domain;
+ if (ptwr_disconnected[cpu] != ENTRIES_PER_L2_PAGETABLE ||
+ ptwr_writable_idx[cpu])
+ printk("switch_to ptwr dirty!!!\n");
+ }
+#endif
+
perfc_incrc(sched_ctx);
#if defined(WAKE_HISTO)
/* Has this page been validated for use as its current type? */
#define _PGT_validated 28
#define PGT_validated (1<<_PGT_validated)
- /* The 10 most significant bits of virt address if this is a L1 page table. */
+ /* The 10 most significant bits of the virtual address if used as an L1 page table. */
#define PGT_va_shift 18
#define PGT_va_mask (((1<<10)-1)<<PGT_va_shift)
+#define PGT_va_mutable PGT_va_mask /* va backpointer is still mutable */
/* 18-bit count of uses of this frame as its current type. */
#define PGT_count_mask ((1<<18)-1)
nx &= ~PGT_validated;
}
}
+ else if ( unlikely( ((nx & PGT_count_mask) == 1) &&
+ test_bit(_PGC_guest_pinned, &page->count_info)) )
+ {
+ /* if the page is pinned, but we're dropping the last reference
+ then make the va backpointer mutable again */
+ nx |= PGT_va_mutable;
+ }
}
while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) );
}
nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated);
nx |= type;
/* No extra validation needed for writable pages. */
- if ( type == PGT_writable_page )
+ if ( (type & PGT_type_mask) == PGT_writable_page )
nx |= PGT_validated;
}
}
- else if ( unlikely((x & (PGT_type_mask|PGT_va_mask)) != type) )
+ else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) )
{
- DPRINTK("Unexpected type or va backptr (saw %08x != exp %08x) "
- "for pfn %08lx\n",
- x & (PGT_type_mask|PGT_va_mask), type, page_to_pfn(page));
+ DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n",
+ x & PGT_type_mask, type, page_to_pfn(page));
+ return 0;
+ }
+ else if ( (x & PGT_va_mask) == PGT_va_mutable )
+ {
+ /* The va_backpointer is currently mutable, hence we update it. */
+ nx &= ~PGT_va_mask;
+ nx |= type; /* we know the actual type is correct */
+ }
+ else if ( unlikely((x & PGT_va_mask) != (type & PGT_va_mask) ) )
+ {
+ /* The va backpointer wasn't mutable, and is different :-( */
+ DPRINTK("Unexpected va backpointer (saw %08x != exp %08x) for pfn %08lx\n",
+ x, type, page_to_pfn(page));
return 0;
}
else if ( unlikely(!(x & PGT_validated)) )
return 1;
}
+/* This 'passive' version of get_page_type doesn't attempt to validate
+the page, but just checks the type and increments the type count. The
+function is called while doing a NORMAL_PT_UPDATE of an entry in an L1
+page table: We want to 'lock' the page for the brief period while
+we're doing the update, but we're not actually linking it in to a
+pagetable. */
+
+static inline int passive_get_page_type(struct pfn_info *page, u32 type) /* returns 1 once the type count has been incremented, 0 on failure */
+{
+ u32 nx, x, y = page->u.inuse.type_info;
+ again:
+ do {
+ x = y; /* x: the snapshot the cmpxchg below must still match */
+ nx = x + 1; /* nx: proposed new word with the type count incremented */
+ if ( unlikely((nx & PGT_count_mask) == 0) )
+ {
+ DPRINTK("Type count overflow on pfn %08lx\n", page_to_pfn(page));
+ return 0;
+ }
+ else if ( unlikely((x & PGT_count_mask) == 0) )
+ { /* type count is currently zero: the type/va bits may be replaced */
+ if ( (x & (PGT_type_mask|PGT_va_mask)) != type )
+ {
+ nx &= ~(PGT_type_mask | PGT_va_mask | PGT_validated); /* PGT_validated is cleared here and never set by this function */
+ nx |= type;
+ }
+ }
+ else if ( unlikely((x & PGT_type_mask) != (type & PGT_type_mask) ) ) /* only the type bits are compared; the va bits are not checked */
+ {
+ DPRINTK("Unexpected type (saw %08x != exp %08x) for pfn %08lx\n",
+ x & PGT_type_mask, type, page_to_pfn(page));
+ return 0;
+ }
+ else if ( unlikely(!(x & PGT_validated)) )
+ {
+ /* Someone else is updating validation of this page. Wait... */
+ while ( (y = page->u.inuse.type_info) != x ) /* NOTE(review): loops only while the word differs from x, so it exits at once if unchanged -- confirm '!=' rather than '==' is intended */
+ {
+ rep_nop();
+ barrier();
+ }
+ goto again; /* retry from the top with the freshly read y */
+ }
+ }
+ while ( unlikely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x) ); /* retry if the word changed since the snapshot */
+
+ return 1;
+}
+
static inline void put_page_and_type(struct pfn_info *page)
{
* ptr[1:0] == MMU_EXTENDED_COMMAND:
* val[7:0] -- MMUEXT_* command.
*
- * val[7:0] == MMUEXT_[UN]PIN_TABLE:
- * ptr[:2] -- Machine address of frame to be (un)pinned as a top-level p.t.
- * page. The frame must belong to the FD, if one is specified.
+ * val[7:0] == MMUEXT_PIN_L*_TABLE or MMUEXT_UNPIN_TABLE:
+ * ptr[:2] -- Machine address of frame to be (un)pinned as a p.t. page.
+ * The frame must belong to the FD, if one is specified.
*
* val[7:0] == MMUEXT_NEW_BASEPTR:
* ptr[:2] -- Machine address of new page-table base to install in MMU.
#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
#define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */
#define MMU_EXTENDED_COMMAND 3 /* least 8 bits of val demux further */
-#define MMUEXT_PIN_TABLE 0 /* ptr = MA of frame to pin */
+#define MMUEXT_PIN_L1_TABLE 0 /* ptr = MA of frame to pin as an L1 table */
+#define MMUEXT_PIN_L2_TABLE 1 /* ptr = MA of frame to pin as an L2 table. NOTE(review): same value as MMUEXT_UNPIN_TABLE (1) as shown here -- confirm the later commands are renumbered */
+#define MMUEXT_PIN_L3_TABLE 2 /* ptr = MA of frame to pin. NOTE(review): same value as MMUEXT_NEW_BASEPTR (2) as shown here -- confirm */
+#define MMUEXT_PIN_L4_TABLE 3 /* ptr = MA of frame to pin. NOTE(review): same value as MMUEXT_TLB_FLUSH (3) as shown here -- confirm */
#define MMUEXT_UNPIN_TABLE 1 /* ptr = MA of frame to unpin */
#define MMUEXT_NEW_BASEPTR 2 /* ptr = MA of new pagetable base */
#define MMUEXT_TLB_FLUSH 3 /* ptr = NULL */